home *** CD-ROM | disk | FTP | other *** search
- /* SwapBlocks ---------------------------------------------
- *
- * Response to Apr 94 MacTech Programmer's Challenge.
- *
- * Object:
- * Exchange contents of two adjacent memory blocks.
- *
- * Redirection:
- * This is an interesting problem, but what would make
- * this guy really useful? As stated, the blocks for
- * the challenge are 4i bytes long and start on 4j
- * aligned addresses. These are special circumstances
- * which apply to Memory Manager blocks, and then, only
- * on 68020 or later cpu's. Memory blocks on the 68000
- * are merely even aligned and even length. Further,
- * this could be a word processor tool for swapping runs
- * of bytes, but we would have to relax the alignment
- * and size restrictions even further to arbitrary address
- * and length since we would almost always be pointing
- * to characters interior to a handle.
- *
- * I have written the routine to give its best
- * performance, subject to a specified minimum enforced
- * alignment and atom size (smallest unit to move).
- * This is controlled at compile time by:
- *
- * typedef long Atom, for len = 4i, addr = 4j,
- * typedef short Atom, for len = 2i, addr = 2j,
- * typedef Byte Atom, for len = any, addr = any.
- *
- * Note- due to an ancient law of portability, preprocessor
- * directives are not allowed to compare enums, types,
- * sizeof()s or anything else that has machine dependency
- * hidden in it. This means you have to #define the
- * AtomSize manually. This is needed to select the proper
- * performance crossover points for that type.
- *
- * But wait there's more...
- * You might not tolerate a 644 byte dedicated word
- * swapper in your text editor, but a 96 byte one
- * might fit. We handle that.
- *
- * You can tailor the routine to your requirements
- * for execution speed vs. code size by setting the
- * JobMode constant according to table:
- *
- * JobMode Buffers MonsterCopies MonsterSwaps
- * ----------------------------------------------
- * Smallest | No No No
- * Small | No No Yes
- * Fast | Yes No Yes
- * Fastest | Yes Yes Yes
- *
- * billKarsh
- *
- */
-
- #pragma options( honor_register, !assign_registers )
-
- /* ---------------------------------------------------- */
- #define Smallest 0
- #define Small 1
- #define Fast 2
- #define Fastest 3
- /* ---------------------------------------------------- */
-
- /* ---------------------------- */
- /* User Selectable Parameters
- /* ---------------------------- */
-
- #define JobMode Fastest
- #define Verify_p1_LowerThan_p2 0
-
- /* Sorry, you must #define your chosen Atom's size by
- hand. The preprocessor won't accept sizeof operators.
- Yuck! The XOvers below vary according to this size,
- so we have to know it.
- */
-
- typedef long Atom;
- #define AtomSize 4
-
- /* ---------------------------------------------------- */
- #if JobMode >= Fast
- # define UseBuffer 1
- #endif
- #if JobMode == Fastest
- # define MonsterCopy 1
- #endif
- #if JobMode >= Small
- # define MonsterSwap 1
- #endif
- /* ---------------------------------------------------- */
- #define Lo3B 0x00ffffff
- /* ---------------------------------------------------- */
- #if AtomSize == 4
- # define FwdXOver 144
- # define BckXOver 120
- # define SwpXOver 44
- #elif AtomSize == 2
- # define FwdXOver 48
- # define BckXOver 44
- # define SwpXOver 32
- #else
- # define FwdXOver 24
- # define BckXOver 20
- # define SwpXOver 12
- #endif
- /* ---------------------------------------------------- */
- #define FwdOp \
- *dst++ = *src++
- /* ---------------------------------------------------- */
- #define BckOp \
- *--pR = *--pL
- /* ---------------------------------------------------- */
- #define SwpOp \
- q = *pL; \
- *pL++ = *pR; \
- *pR++ = q
- /* ---------------------------------------------------- */
- #define Cases3_1( op ) \
- case 3: op; \
- case 2: op; \
- case 1: op
- /* ---------------------------------------------------- */
- #define Cases7_1( op ) \
- case 7: op; \
- case 6: op; \
- case 5: op; \
- case 4: op; \
- Cases3_1( op )
- /* ---------------------------------------------------- */
- #define CalcPasses( bits ) \
- nS /= sizeof(Atom); \
- q = nS & ((1 << bits) - 1); \
- nS >>= bits; \
- ++nS
- /* ---------------------------------------------------- */
- #define Monster( op, cases ) \
- switch( (short)q ) { \
- case 0: \
- while( --nS ) { \
- op; \
- cases( op ); \
- } \
- }
- /* ---------------------------------------------------- */
- #if MonsterCopy == 1
- #define CopyInc( dst, src, n ) \
- nS = n; \
- if( nS > FwdXOver ) { \
- _CopyInc( \
- (Atom*)(dst), (Atom*)(src), nS ); \
- } \
- else { \
- pL = (Atom*)(dst); \
- pR = (Atom*)(src); \
- do { *pL++ = *pR++; } while(nS-=sizeof(Atom)); \
- }
- #else
- #define CopyInc( dst, src, n ) \
- nS = n; \
- pL = (Atom*)(dst); \
- pR = (Atom*)(src); \
- do { *pL++ = *pR++; } while(nS-=sizeof(Atom))
- #endif
- /* ---------------------------------------------------- */
- #if MonsterCopy == 1
- #define CopyDec( dst, src, n ) \
- nS = n; \
- pR = (Atom*)((Byte*)(dst) + nS); \
- pL = (Atom*)((Byte*)(src) + nS); \
- if( nS > BckXOver ) { \
- CalcPasses( 2 ); \
- Monster( BckOp, Cases3_1 ); \
- } \
- else { \
- do { BckOp; } while(nS-=sizeof(Atom)); \
- }
- #else
- #define CopyDec( dst, src, n ) \
- nS = n; \
- pR = (Atom*)((Byte*)(dst) + nS); \
- pL = (Atom*)((Byte*)(src) + nS); \
- do { BckOp; } while(nS-=sizeof(Atom))
- #endif
- /* ---------------------------------------------------- */
- #if MonsterSwap == 1
- #define Swap \
- if( nS > SwpXOver ) { \
- CalcPasses( 3 ); \
- Monster( SwpOp, Cases7_1 ); \
- } \
- else { \
- do { SwpOp; } while(nS-=sizeof(Atom)); \
- }
- #else
- #define Swap \
- do { SwpOp; } while(nS-=sizeof(Atom))
- #endif
- /* ---------------------------------------------------- */
- #define MacroMania true
- /* ---------------------------------------------------- */
-
- #if JobMode == Fastest
- /* _CopyInc -----------------------------------------------
- *
- * Copy specified number of Bytes from src to dst.
- *
- * Addresses are incremented, so src and dst can overlap
- * iff dst <= src.
- *
- */
-
- static void _CopyInc(
- register Atom *dst,
- register const Atom *src,
- register unsigned long nS )
- {
- short q, pad;
-
- CalcPasses( 3 );
- Monster( FwdOp, Cases7_1 );
- }
- #endif
-
- /* SwapBlocks ---------------------------------------------
- *
- */
-
- void SwapBlocks(
- void *p1,
- void *p2,
- void *swapPtr,
- unsigned long size1,
- unsigned long size2,
- unsigned long swapSize )
- {
- register Atom *pL, *pR, *p0;
- register long nL, nR, nS, q;
- Boolean done;
- short pad;
-
- if( !(nL = size1) || !(nR = size2) ) return;
-
- p0 = p1;
-
- /* --------------------------------------------------------
- If you can safely assume that p1 is always lower
- or same as p2, define Verify_p1_LowerThan_p2 = 0.
- (the #if section is not necessary).
-
- If "1" and "2" are simply labels, indicating
- nothing about position in memory of the blocks,
- then you must order them by activating the #if
- section. Define Verify_p1_LowerThan_p2 = 1.
-
- Ordering means comparing addresses, which treats
- them as 32-bit numbers, NO MATTER the current cpu
- addressing mode. If GetMMUMode returns true, we
- are in 32-bit mode - all 32-bits are significant.
-
- In 24-bit mode, when the cpu uses an address to
- load or store something, it totally ignores the
- high-byte of the address. The high-byte may be
- random garbage. In this mode we suppress any
- garbage before comparing by masking it to zero.
- */
-
- #if Verify_p1_LowerThan_p2 == 1
-
- pR = p2;
-
- if( !GetMMUMode() ) {
- p0 = (Atom*)((long)p0 & Lo3B);
- pR = (Atom*)((long)pR & Lo3B);
- }
-
- if( pR < p0 ) {
- q = (long)p0;
- p0 = pR;
- p2 = (Atom*)q;
-
- q = nL;
- nL = nR;
- nR = q;
- }
- #endif
- /* ----------------------------------------------------- */
-
- /* ----------- */
- /* Buffer? */
- /* ----------- */
-
- /* First, make use of buffer if can. This is faster in
- most cases. A notable exception is equal size case
- which is best done in situ (let drop through).
-
- Compare only the smaller size with buffer. If left
- is smaller, we can use post-increment addressing
- which is the faster mode. If right is smaller,
- use pre-decrement mode. We omit seeing if
- right-smaller will work with post-increment mode
- (if left also fits buffer). Preflighting overhead
- swallows us up very quickly.
- */
-
- #if UseBuffer == 1
-
- if( nL < nR ) {
- if( nL <= swapSize ) {
- CopyInc( swapPtr, p0, nL );
- CopyInc( p0, p2, nR );
- CopyInc( (Byte*)p0 + nR, swapPtr, nL );
- return;
- }
- }
- else if( nL > nR ) {
- if( nR <= swapSize ) {
- CopyInc( swapPtr, p2, nR );
- CopyDec( (Byte*)p0 + nR, p0, nL );
- CopyInc( p0, swapPtr, nR );
- return;
- }
- }
- #endif
-
- /* ----------- */
- /* In Situ */
- /* ----------- */
-
- /* This algorithm always does the job, buffer or not.
-
- Find the smaller block. Swap it immediately into
- its final place. Now the larger block is in two
- out-of-order, but contiguous pieces. Wait a minute,
- this is what we started with! The only differences
- are: now the sizes are {smaller, larger - smaller},
- and the start addresses have to keep up with the
- new pieces.
-
- We repeat until the two pieces were the same length.
- In other words, the final swap didn't break anybody
- in two. This can end with sizes larger than Atom-Atom.
- It depends on whether the smaller evenly divides the
- larger.
- */
-
- done = false;
-
- do {
-
- pL = p0;
- pR = p2;
-
- if( nL < nR ) {
- nR = nR - nL;
- pR = (Atom*)((Byte*)pR + nR);
- nS = nL;
- }
- else if( nL > nR ) {
- p0 = (Atom*)((Byte*)pL + nR);
- nL = nL - nR;
- nS = nR;
- }
- else {
- nS = nL;
- done = true;
- }
-
- Swap;
-
- } while( !done );
- }
-